Hitters <- Hitters
str(Hitters)
## 'data.frame': 322 obs. of 20 variables:
## $ AtBat : int 293 315 479 496 321 594 185 298 323 401 ...
## $ Hits : int 66 81 130 141 87 169 37 73 81 92 ...
## $ HmRun : int 1 7 18 20 10 4 1 0 6 17 ...
## $ Runs : int 30 24 66 65 39 74 23 24 26 49 ...
## $ RBI : int 29 38 72 78 42 51 8 24 32 66 ...
## $ Walks : int 14 39 76 37 30 35 21 7 8 65 ...
## $ Years : int 1 14 3 11 2 11 2 3 2 13 ...
## $ CAtBat : int 293 3449 1624 5628 396 4408 214 509 341 5206 ...
## $ CHits : int 66 835 457 1575 101 1133 42 108 86 1332 ...
## $ CHmRun : int 1 69 63 225 12 19 1 0 6 253 ...
## $ CRuns : int 30 321 224 828 48 501 30 41 32 784 ...
## $ CRBI : int 29 414 266 838 46 336 9 37 34 890 ...
## $ CWalks : int 14 375 263 354 33 194 24 12 8 866 ...
## $ League : Factor w/ 2 levels "A","N": 1 2 1 2 2 1 2 1 2 1 ...
## $ Division : Factor w/ 2 levels "E","W": 1 2 2 1 1 2 1 2 2 1 ...
## $ PutOuts : int 446 632 880 200 805 282 76 121 143 0 ...
## $ Assists : int 33 43 82 11 40 421 127 283 290 0 ...
## $ Errors : int 20 10 14 3 4 25 7 9 19 0 ...
## $ Salary : num NA 475 480 500 91.5 750 70 100 75 1100 ...
## $ NewLeague: Factor w/ 2 levels "A","N": 1 2 1 2 2 1 1 1 2 1 ...
sum(is.na(Hitters$Salary))
## [1] 59
Hitters <- na.omit(Hitters)
We will use the data included in the Hitters dataset to predict players’ Salary (our dependent variable) by means of PCR.
| Variable | Description | |
|---|---|---|
| AtBat | Number of times at bat in 1986 | |
| Hits | Number of hits in 1986 | |
| HmRun | Number of home runs in 1986 | |
| Runs | Number of runs in 1986 | |
| RBI | Number of runs batted in in 1986 | |
| Walks | Number of walks in 1986 | |
| Years | Number of years in the major leagues | |
| CAtBat | Number of times at bat during his career | |
| CHits | Number of hits during his career | |
| CHmRun | Number of home runs during his career | |
| CRuns | Number of runs during his career | |
| CRBI | Number of runs batted in during his career | |
| Cwalks | Number of walks during his career | |
| League | A factor with levels A and N indicating player’s league at the end of 1986 | |
| Division | A factor with levels E and W indicating player’s division at the end of 1986 | |
| PutOuts | Number of put outs in 1986 | |
| Assists | Number of assists in 1986 | |
| Errors | Number of errors in 1986 | |
| Salary | 1987 annual salary on opening day in thousands of dollars | |
| NewLeague | A factor with levels A and N indicating player’s league at the beginning of 1987 |
mat <- Hitters[,-19] # exclude salary
unique(mat$Division) #E W
## [1] W E
## Levels: E W
unique(mat$NewLeague) #A N
## [1] N A
## Levels: A N
unique(mat$League) #A N
## [1] N A
## Levels: A N
# encoding factors
mat$Division <- as.numeric(mat$Division=="E")
mat$NewLeague <- as.numeric(mat$NewLeague=="A")
mat$League <- as.numeric(mat$League=="A")
mat <- as.matrix(mat)
dim(mat) # Matrix of regressors
## [1] 263 19
KMO
KMO(mat)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = mat)
## Overall MSA = 0.71
## MSA for each item =
## AtBat Hits HmRun Runs RBI Walks Years CAtBat
## 0.78 0.65 0.64 0.69 0.77 0.70 0.94 0.78
## CHits CHmRun CRuns CRBI CWalks League Division PutOuts
## 0.64 0.67 0.69 0.72 0.75 0.55 0.40 0.86
## Assists Errors NewLeague
## 0.60 0.63 0.53
{stats} function prcomp().Note the center=T (default) and scale = T arguments
pca <- prcomp(mat, center = T, scale. = T)
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.6981 2.0371 1.4249 1.24763 0.99933 0.90855 0.83027
## Proportion of Variance 0.3831 0.2184 0.1069 0.08193 0.05256 0.04345 0.03628
## Cumulative Proportion 0.3831 0.6016 0.7084 0.79034 0.84290 0.88635 0.92263
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.7163 0.5007 0.42990 0.37047 0.35704 0.30917 0.24706
## Proportion of Variance 0.0270 0.0132 0.00973 0.00722 0.00671 0.00503 0.00321
## Cumulative Proportion 0.9496 0.9628 0.97255 0.97978 0.98649 0.99152 0.99473
## PC15 PC16 PC17 PC18 PC19
## Standard deviation 0.22798 0.16735 0.11871 0.06973 0.03446
## Proportion of Variance 0.00274 0.00147 0.00074 0.00026 0.00006
## Cumulative Proportion 0.99747 0.99894 0.99968 0.99994 1.00000
names(pca) # see help for detailed information
## [1] "sdev" "rotation" "center" "scale" "x"
pca$rotation[,1:4]
## PC1 PC2 PC3 PC4
## AtBat 0.1982903511 0.38378403 -0.08862593 0.031967432
## Hits 0.1958612933 0.37727112 -0.07403226 0.017982351
## HmRun 0.2043689229 0.23713561 0.21618563 -0.235830772
## Runs 0.1983370917 0.37772134 0.01716642 -0.049941505
## RBI 0.2351738026 0.31453120 0.07308534 -0.138985255
## Walks 0.2089237517 0.22960610 -0.04563592 -0.130615462
## Years 0.2825754503 -0.26240195 -0.03458097 0.095311911
## CAtBat 0.3304629263 -0.19290382 -0.08357442 0.091114440
## CHits 0.3307416802 -0.18289883 -0.08625107 0.083751427
## CHmRun 0.3189794925 -0.12629732 0.08627233 -0.074277886
## CRuns 0.3382078595 -0.17227611 -0.05299565 0.069177419
## CRBI 0.3403428387 -0.16809208 -0.01499274 0.006673562
## CWalks 0.3168029362 -0.19231496 -0.04212050 0.030371431
## League 0.0544708722 0.09521324 0.54772520 0.396013097
## Division 0.0257252900 0.03667957 -0.01620121 -0.042746862
## PutOuts 0.0776971752 0.15573663 -0.05127418 -0.287591484
## Assists -0.0008416413 0.16865189 -0.39787109 0.524053054
## Errors -0.0078593695 0.20075992 -0.38285041 0.421917204
## NewLeague 0.0419103083 0.07758356 0.54458172 0.417718410
# Eigenvalues in the first four components are > 1.
pca$sdev
## [1] 2.69809294 2.03710687 1.42492394 1.24762925 0.99932745 0.90854598
## [7] 0.83026536 0.71626082 0.50073259 0.42990363 0.37046570 0.35704307
## [13] 0.30917060 0.24705633 0.22798243 0.16734805 0.11871224 0.06973092
## [19] 0.03445714
(shown for principal components 1 - 10)
round(t(pca$rotation[,1:10]) %*% pca$rotation[,1:10], 5)
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
## PC1 1 0 0 0 0 0 0 0 0 0
## PC2 0 1 0 0 0 0 0 0 0 0
## PC3 0 0 1 0 0 0 0 0 0 0
## PC4 0 0 0 1 0 0 0 0 0 0
## PC5 0 0 0 0 1 0 0 0 0 0
## PC6 0 0 0 0 0 1 0 0 0 0
## PC7 0 0 0 0 0 0 1 0 0 0
## PC8 0 0 0 0 0 0 0 1 0 0
## PC9 0 0 0 0 0 0 0 0 1 0
## PC10 0 0 0 0 0 0 0 0 0 1
screeplot(pca, type="lines")
abline(h=1, col="red")
pca$x[1:10,1:4]
## PC1 PC2 PC3 PC4
## -Alan Ashby -0.009630358 -1.8669625 -1.2627377 -0.93370088
## -Alvin Davis 0.410650757 2.4247988 0.9074630 -0.26370961
## -Andre Dawson 3.460224766 -0.8243753 -0.5544124 -1.61364990
## -Andres Galarraga -2.553449083 0.2305443 -0.5186536 -2.17210952
## -Alfredo Griffin 1.025746581 1.5705427 -1.3288484 3.48735458
## -Al Newman -3.973081710 -1.5044104 0.1551832 0.36913641
## -Argenis Salazar -3.445150319 -0.5988471 0.6252834 1.99597066
## -Andres Thomas -3.425848614 -0.1133262 -1.9959449 0.76635168
## -Andre Thornton 3.892286472 -1.9441629 1.8170103 -0.02666265
## -Alan Trammell 3.168770232 2.3878127 -0.7929565 2.56411717
gdta <- data.frame(Salary=Hitters$Salary,PC1=pca$x[,1],PC2=pca$x[,2])
gdta$Name <- row.names(Hitters)
ggplotly(
ggplot(gdta)+
geom_point(aes(PC1,PC2,color=Salary))+
scale_colour_gradient(low="red",high="green")+
xlab("PC1")+
ylab("PC2"),
tooltip = c("Salary")
)
# 3D plot of the same data
# plot_ly(x=pca$x[,1], y=pca$x[,2], z=Hitters$Salary,type="scatter3d", mode="markers", color=Hitters$Salary)
We may simply do OLS on the PCA-generated principal components:
summary(lm(Hitters$Salary ~ pca$x[,1:4]))
##
## Call:
## lm(formula = Hitters$Salary ~ pca$x[, 1:4])
##
## Residuals:
## Min 1Q Median 3Q Max
## -905.49 -173.80 -24.63 115.75 2163.36
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 535.926 21.122 25.373 <2e-16 ***
## pca$x[, 1:4]PC1 106.571 7.843 13.587 <2e-16 ***
## pca$x[, 1:4]PC2 21.645 10.388 2.084 0.0382 *
## pca$x[, 1:4]PC3 -24.341 14.852 -1.639 0.1024
## pca$x[, 1:4]PC4 -37.056 16.962 -2.185 0.0298 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 342.5 on 258 degrees of freedom
## Multiple R-squared: 0.4322, Adjusted R-squared: 0.4234
## F-statistic: 49.1 on 4 and 258 DF, p-value: < 2.2e-16
e.g. pcr() from the {pls} package.
pcr.fit <- pcr(Salary~., data=Hitters, scale=TRUE,validation="CV")
pcr.fit
## Principal component regression , fitted with the singular value decomposition algorithm.
## Cross-validated using 10 random segments.
## Call:
## pcr(formula = Salary ~ ., data = Hitters, scale = TRUE, validation = "CV")
summary(pcr.fit)
## Data: X dimension: 263 19
## Y dimension: 263 1
## Fit method: svdpc
## Number of components considered: 19
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 452 351.9 353.2 355.0 352.8 348.4 343.6
## adjCV 452 351.6 352.7 354.4 352.1 347.6 342.7
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 345.5 347.7 349.6 351.4 352.1 353.5 358.2
## adjCV 344.7 346.7 348.5 350.1 350.7 352.0 356.5
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps
## CV 349.7 349.4 339.9 341.6 339.2 339.6
## adjCV 348.0 347.7 338.2 339.7 337.2 337.6
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 38.31 60.16 70.84 79.03 84.29 88.63 92.26 94.96
## Salary 40.63 41.58 42.17 43.22 44.90 46.48 46.69 46.75
## 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
## X 96.28 97.26 97.98 98.65 99.15 99.47 99.75
## Salary 46.86 47.76 47.82 47.85 48.10 50.40 50.55
## 16 comps 17 comps 18 comps 19 comps
## X 99.89 99.97 99.99 100.00
## Salary 53.01 53.85 54.61 54.61
pcr.cv <- crossval(pcr.fit, segments = 10)
plot(MSEP(pcr.cv), legendpos="topright")
summary(pcr.cv, what = "validation")
## Data: X dimension: 263 19
## Y dimension: 263 1
## Fit method: svdpc
## Number of components considered: 19
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 452 351.5 350.5 349.1 347.0 342.7 340.4
## adjCV 452 351.2 350.2 348.8 346.6 342.4 339.8
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 341.2 343.3 343.8 344.8 346.6 347.6 351.3
## adjCV 340.5 342.5 343.0 343.7 345.4 346.4 350.0
## 14 comps 15 comps 16 comps 17 comps 18 comps 19 comps
## CV 343.3 344.4 336.9 341.3 339.2 340.8
## adjCV 341.9 342.9 335.4 339.4 337.3 338.7
selectNcomp(pcr.fit, method = "onesigma", plot = TRUE)
## [1] 1
ncomp = 1pcr.fit.1 <- pcr(Salary~., data=Hitters, scale=TRUE, ncomp=1)
summary(pcr.fit.1)
## Data: X dimension: 263 19
## Y dimension: 263 1
## Fit method: svdpc
## Number of components considered: 1
## TRAINING: % variance explained
## 1 comps
## X 38.31
## Salary 40.63
loadings(pcr.fit.1)[,1, drop=F] # compare with: pca$rotation[,1:4] ...above
## Comp 1
## AtBat 0.1982903511
## Hits 0.1958612933
## HmRun 0.2043689229
## Runs 0.1983370917
## RBI 0.2351738026
## Walks 0.2089237517
## Years 0.2825754503
## CAtBat 0.3304629263
## CHits 0.3307416802
## CHmRun 0.3189794925
## CRuns 0.3382078595
## CRBI 0.3403428387
## CWalks 0.3168029362
## LeagueN -0.0544708722
## DivisionW -0.0257252900
## PutOuts 0.0776971752
## Assists -0.0008416413
## Errors -0.0078593695
## NewLeagueN -0.0419103083
ncomp = 4(although CV suggests only one PC for regression)
pcr.fit.4 <- pcr(Salary~., data=Hitters, scale=TRUE, ncomp=4)
summary(pcr.fit.4)
## Data: X dimension: 263 19
## Y dimension: 263 1
## Fit method: svdpc
## Number of components considered: 4
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps
## X 38.31 60.16 70.84 79.03
## Salary 40.63 41.58 42.17 43.22
loadings(pcr.fit.4)[,1:4] # compare with: pca$rotation[,1:4] ...above
## Comp 1 Comp 2 Comp 3 Comp 4
## AtBat 0.1982903511 0.38378403 -0.08862593 0.031967432
## Hits 0.1958612933 0.37727112 -0.07403226 0.017982351
## HmRun 0.2043689229 0.23713561 0.21618563 -0.235830772
## Runs 0.1983370917 0.37772134 0.01716642 -0.049941505
## RBI 0.2351738026 0.31453120 0.07308534 -0.138985255
## Walks 0.2089237517 0.22960610 -0.04563592 -0.130615462
## Years 0.2825754503 -0.26240195 -0.03458097 0.095311911
## CAtBat 0.3304629263 -0.19290382 -0.08357442 0.091114440
## CHits 0.3307416802 -0.18289883 -0.08625107 0.083751427
## CHmRun 0.3189794925 -0.12629732 0.08627233 -0.074277886
## CRuns 0.3382078595 -0.17227611 -0.05299565 0.069177419
## CRBI 0.3403428387 -0.16809208 -0.01499274 0.006673562
## CWalks 0.3168029362 -0.19231496 -0.04212050 0.030371431
## LeagueN -0.0544708722 -0.09521324 -0.54772520 -0.396013097
## DivisionW -0.0257252900 -0.03667957 0.01620121 0.042746862
## PutOuts 0.0776971752 0.15573663 -0.05127418 -0.287591484
## Assists -0.0008416413 0.16865189 -0.39787109 0.524053054
## Errors -0.0078593695 0.20075992 -0.38285041 0.421917204
## NewLeagueN -0.0419103083 -0.07758356 -0.54458172 -0.417718410
pcr.predictions <- predict(pcr.fit, Hitters[1:100,], ncomp=1, type="response")
head(cbind(Hitters$Salary[1:100], pcr.predictions))
## pcr.predictions
## [1,] 475.0 534.8996
## [2,] 480.0 579.6895
## [3,] 500.0 904.6869
## [4,] 91.5 263.8013
## [5,] 750.0 645.2411
## [6,] 70.0 112.5090
predplot(pcr.fit, ncomp = 1, newdata = Hitters[1:100,], asp = 1, line = TRUE)
# You can see that the benefit from adding 3 additional components is ver small:
predplot(pcr.fit, ncomp = 4, newdata = Hitters[1:100,], asp = 1, line = TRUE)
To assess PCR vs OLS prediction efficiency, we shall split “Hitters” data.frame into a train sample (model estimation) and test sample (to calculate and compare Salary predictions) at random, approx. 50/50.
set.seed(1)
train <- sample(c(TRUE,FALSE), nrow(Hitters),rep=TRUE)
test <- (!train)
ols.fit <- lm(Salary~., data=Hitters, subset=train)
summary(ols.fit)
##
## Call:
## lm(formula = Salary ~ ., data = Hitters, subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -678.60 -172.97 -9.98 125.73 994.87
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40.08756 121.48976 0.330 0.74203
## AtBat -1.29331 0.84598 -1.529 0.12909
## Hits 6.21164 3.15297 1.970 0.05125 .
## HmRun -10.00428 7.85328 -1.274 0.20529
## Runs -1.17911 3.69173 -0.319 0.75001
## RBI -0.48366 3.34330 -0.145 0.88523
## Walks 8.37439 2.52940 3.311 0.00125 **
## Years -7.72545 17.52402 -0.441 0.66016
## CAtBat -0.12788 0.19539 -0.655 0.51410
## CHits -0.36166 0.92766 -0.390 0.69736
## CHmRun -1.14781 1.98931 -0.577 0.56508
## CRuns 2.12285 0.92928 2.284 0.02420 *
## CRBI 1.19588 0.92003 1.300 0.19628
## CWalks -0.95610 0.43287 -2.209 0.02919 *
## LeagueN 31.92253 132.71737 0.241 0.81035
## DivisionW -111.09365 52.12002 -2.131 0.03520 *
## PutOuts 0.21107 0.09224 2.288 0.02396 *
## Assists -0.20505 0.27916 -0.735 0.46413
## Errors 1.38852 5.60344 0.248 0.80474
## NewLeagueN 43.89201 129.99978 0.338 0.73626
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 279.8 on 114 degrees of freedom
## Multiple R-squared: 0.6658, Adjusted R-squared: 0.6101
## F-statistic: 11.95 on 19 and 114 DF, p-value: < 2.2e-16
ols.fitted <- predict(ols.fit, newdata=Hitters[test==T,])
MSE.ols <- mean((ols.fitted - Hitters$Salary[test==T])^2)
set.seed(10)
pcr.fit2 <- pcr(Salary~., data=Hitters, subset=train, scale=TRUE, validation="CV")
pcr.pred2.1.comp <- predict(pcr.fit2, Hitters[test==T, ], ncomp=1, type="response")
MSE.pcr.1.c <- mean((pcr.pred2.1.comp - Hitters$Salary[test==T])^2)
pcr.pred2.4.comp <- predict(pcr.fit2, Hitters[test==T, ], ncomp=4, type="response")
MSE.pcr.4.c <- mean((pcr.pred2.4.comp - Hitters$Salary[test==T])^2)
MSE.ols # Test MSE for the OLS model
## [1] 142238.2
MSE.pcr.1.c # Test MSE for the PCR model, 1 component
## [1] 135538.1
MSE.pcr.4.c # Test MSE for the PCR model, 4 components
## [1] 132247
Note: The train sample / test sample setup as shown here is arbitrary and potentially not-representative. Better evaluation/comparison can be performed through k-Fold Cross Validation.